See the source for details
library('poppr')
## Loading required package: adegenet
## Loading required package: ade4
## ==========================
## adegenet 2.0.0 is loaded
## ==========================
##
## - to start: type '?adegenet'
## - to browse the adegenet website: type 'adegenetWeb()'
## - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
## - to report bugs, request features, contribute: http://goo.gl/dZuu5X
##
##
## This is poppr version 1.1.4.99.407. To get started, type package?poppr
library('ape')
library('phangorn')
library('animation')
infdat <- RCurl::getURL("https://raw.githubusercontent.com/grunwaldlab/phytophthora_id/master/shiny-server/www/genoid_infestans/reduced_database.txt.csv")
infdat <- read.table(text = infdat, header = TRUE)
pinf <- df2genind(infdat[-c(1,2)], sep = "/", ploidy = 3, ind.names = infdat[[1]], pop = infdat[[2]])
ssr <- c(3,3,2,3,3,2,2,3,3,3,3,3)
x <- as.genclone(pinf)
fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# #
# tiff(filename = "images/pinf_cluster.tiff", width = 85, height = 68, units = "mm", res = 1200, pointsize = 6)
# fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
# title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# dev.off()
# legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"),
# col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")
The plot above shows how multilocus genotypes collapse under differing algorithms over genetic distance.
Below, we will collapse MLGs with a threshold of an average of 2 mutational steps over all loci and create contingency tables relating the clustered MLGs to the previously defined MLGs (eg. US-8).
z <- filter_stats(x, bruvo.dist, replen = ssr, loss = FALSE, threshold = 0.75/12, stats = "MLGS")
print(table(pop(x), z$farthest), zero.print = ".")
##
## 3 4 5 6 8 9 10 12 15 16 17 18 20 21 22 24 25 27 28
## B . . . . . . . . . . . . . . . . 1 . .
## C . . . . . . . . . . . . . . . 1 . . .
## D.1 . . . . . . . . . . . . . . 1 . . . .
## D.2 . . . . . . . . . . . . . . 1 . . . .
## EU-13 . . . . . . . . . 1 . . . . . . . . .
## EU-4 . . . . . . . . . . 1 . . . . . . . .
## EU-5 . . . . . . . . . . . 2 . . . . . . .
## EU-8 . . . . . . . 1 . . . . . . . . . . .
## US-11 . . . . . . . . . . . . . . . . . . 2
## US-12 . 1 . . . . . . . . . . . . . . . . .
## US-14 . . . . . 1 . . . . . . . . . . . . .
## US-17 . . . . . . . . . . . . 1 . . . . . .
## US-20 2 . . . . . . . . . . . . . . . . . .
## US-21 . . . . . . . . . . . . . . . . . 2 .
## US-22 . . . . . . . . . . . . . 2 . . . . .
## US-23 . . . . . . . . 3 . . . . . . . . . .
## US-24 . . . . 3 . . . . . . . . . . . . . .
## US-8 . . 1 1 . . 2 . . . . . . . . . . . .
print(table(pop(x), z$nearest), zero.print = ".")
##
## 3 4 5 6 8 10 12 14 16 17 18 20 21 22 24 25 27 28
## B . . . . . . . . . . . . . . . 1 . .
## C . . . . . . . . . . . . . . 1 . . .
## D.1 . . . . . . . . . . . . . 1 . . . .
## D.2 . . . . . . . . . . . . . 1 . . . .
## EU-13 . . . . . . . . 1 . . . . . . . . .
## EU-4 . . . . . . . . . 1 . . . . . . . .
## EU-5 . . . . . . . . . . 2 . . . . . . .
## EU-8 . . . . . . 1 . . . . . . . . . . .
## US-11 . . . . . . . . . . . . . . . . . 2
## US-12 . 1 . . . . . . . . . . . . . . . .
## US-14 . . . . . 1 . . . . . . . . . . . .
## US-17 . . . . . . . . . . . 1 . . . . . .
## US-20 2 . . . . . . . . . . . . . . . . .
## US-21 . . . . . . . . . . . . . . . . 2 .
## US-22 . . . . . . . . . . . . 2 . . . . .
## US-23 . . . . . . . 3 . . . . . . . . . .
## US-24 . . . . 3 . . . . . . . . . . . . .
## US-8 . . 1 1 . 2 . . . . . . . . . . . .
tab <- mlg.table(x, bar = FALSE)
colnames(tab) <- 1:ncol(tab)
print.table(tab, zero.print = ".") # contingency table for zero tolerance MLGs.
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## B . . . . . . . . . . . . . . . . . . . . . . . . 1 .
## C . . . . . . . . . . . . . . . . . . . . . . . 1 . .
## D.1 . . . . . . . . . . . . . . . . . . . . . 1 . . . .
## D.2 . . . . . . . . . . . . . . . . . . . . . . 1 . . .
## EU-13 . . . . . . . . . . . . . . . 1 . . . . . . . . . .
## EU-4 . . . . . . . . . . . . . . . . 1 . . . . . . . . .
## EU-5 . . . . . . . . . . . . . . . . . 1 1 . . . . . . .
## EU-8 . . . . . . . . . . . 1 . . . . . . . . . . . . . .
## US-11 . . . . . . . . . . . . . . . . . . . . . . . . . .
## US-12 . . . 1 . . . . . . . . . . . . . . . . . . . . . .
## US-14 . . . . . . . . 1 . . . . . . . . . . . . . . . . .
## US-17 . . . . . . . . . . . . . . . . . . . 1 . . . . . .
## US-20 . 1 1 . . . . . . . . . . . . . . . . . . . . . . .
## US-21 . . . . . . . . . . . . . . . . . . . . . . . . . 1
## US-22 1 . . . . . . . . . . . . . . . . . . . 1 . . . . .
## US-23 . . . . . . . . . . . . 1 1 1 . . . . . . . . . . .
## US-24 . . . . . . 1 2 . . . . . . . . . . . . . . . . . .
## US-8 . . . . 1 1 . . . 1 1 . . . . . . . . . . . . . . .
## 27 28 29
## B . . .
## C . . .
## D.1 . . .
## D.2 . . .
## EU-13 . . .
## EU-4 . . .
## EU-5 . . .
## EU-8 . . .
## US-11 . 1 1
## US-12 . . .
## US-14 . . .
## US-17 . . .
## US-20 . . .
## US-21 1 . .
## US-22 . . .
## US-23 . . .
## US-24 . . .
## US-8 . . .
Most of the MLGs were able to be resolved. US-8 and D.1 and D.2 are not exactly resolved, but that is shown in the trees produced from the commented code below.
# Uncomment to regenerate plots
#
# for (i in names(fstats)){
# HTML_collapse(measure = i, x = x, treefun = "upgma",
# distfun = "bruvo.dist", fstats = fstats, destdir = "images",
# replen = ssr, loss = FALSE)
# HTML_collapse(measure = i, x = x, treefun = "nj",
# distfun = "bruvo.dist", fstats = fstats, destdir = "images",
# replen = ssr, loss = FALSE)
# }
This will create 20 populations with 20 samples and 10k SNPs. Each population will have:
It is important that I define what “mating” is here. When I talk about “mating”, I have a function that will sample with replacement parental pairs. These pairs will first have a crossover event before meiosis and then single chromosomes are randomly chosen from each parent and then randomly mutated to create the new offspring. Only one offspring per parent pair is created.
The functions that perform this are :
random_mate_gen()
random_mate()
mate()
crossover()pop_mutator()This is to account for the fact that when glSim simulates populations, the terminal branches are EXTREMELY long. This scheme seems to make the branches a bit more realistic.
In addition, half of these populations will have undergone one generation of clonal reproduction. Each sample has a unique 10 letter identification. Samples from clonal populations will have a number appended to the end. We will use the 10 letter identifier to detect clones.
set.seed(20150415)
x <- lapply(1:10, getSims, n = 40, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, na.perc = 0.21, clone = TRUE, n.cores = 4, mate_gen = 20)
## Loading required package: parallel
y <- lapply(1:10, getSims, n = 40, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, na.perc = 0.21, clone = FALSE, n.cores = 4, mate_gen = 20)
# x <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = TRUE, n.cores = 4)
# y <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = FALSE, n.cores = 4)
The samples are then pooled.
For analysis, \(\frac{1}{5}\)th of the pooled samples will be kept.
fstats gives the statistics from mlg.filter when the threshold is set to the maximum distance possible. Finding the largest difference between two threshold values in the upper 50% of the data serves as a rough prediction of the threshold at which clones should be collapsed.
z <- do.call("rbind", c(x, y))
z <- z[sample(nInd(z), nInd(z)/5)]
trueclones <- duplicated(substr(indNames(z), start = 1, stop = 10))
fstats <- filter_stats(z, bitwise.dist, plot = TRUE)
(the_threshold <- threshold_predictor(fstats$average$thresholds))
## [1] 0.077075
abline(v = the_threshold, lty = 2)
This predicted threshold is then used to compare the defined clones to the true clones as presented in a contingency table.
thresh <- duplicated(mlg.filter(z, distance = bitwise.dist,
threshold = the_threshold,
algorithm = "a"))
(threshtable <- table(thresh, trueclones))
## trueclones
## thresh FALSE TRUE
## FALSE 146 0
## TRUE 0 14
The tabulation is a power analysis of how many true and false positives there are when collapsing at the threshold that gives the same number of known clones/replicates.
Below is labelling a tree with known clones.
the_tree <- upgma(bitwise.dist(z))
clones <- substr(the_tree$tip.label[thresh], start = 1, stop = 10)
clones <- lapply(clones, grep, the_tree$tip.label)
edgelist <- length(which.edge(the_tree, the_tree$tip.label))
edgecols <- rep("black", edgelist)
for (i in clones){
edgecols[which.edge(the_tree, the_tree$tip.label[i])] <- "red"
}
plot.phylo(the_tree, edge.color = edgecols, adj = 0, label.offset = 0.001)
axisPhylo(1)
title("Random sequences with 10,000 SNPs and a 0.1 error rate")
For these, we will simulate 10,000 markers for populations of 200 samples each.
nreps <- 100
resarray <- array(data = integer(nreps*4), dim = c(2, 2, nreps),
dimnames = c(dimnames(threshtable), NULL))
neararray <- resarray
fararray <- resarray
avarray <- resarray
samplist <- lapply(1:nreps, function(x) list(samp = NULL, tree = NULL,
mlgs = NULL))
Sys.time()
## [1] "2015-05-04 16:04:46 PDT"
for (i in 1:nreps){
set.seed(i) # setting seed for accuracy.
snps <- 1e4
samp1 <- getSims(n = 200, snps = snps, strucrat = 0.5, ploidy = 2,
err = 0.1, na.perc = 0.1, clone = TRUE, n.cores = 4,
k = 10, pop.freq = rep(0.1, 10), alpha = 0.25)
samp2 <- getSims(n = 200, snps = snps, strucrat = 0.5, ploidy = 2,
err = 0.1, na.perc = 0.1, clone = FALSE, n.cores = 4,
k = 10, pop.freq = rep(0.1, 10), alpha = 0.25)
samp <- do.call("rbind", c(samp1, samp2))
samp@ploidy <- rep(2L, nInd(samp))
samp <- samp[sample(nInd(samp), nInd(samp)/5)]
trueclones <- duplicated(substr(indNames(samp), start = 1, stop = 10))
fstats <- filter_stats(samp, bitwise.dist, plot = TRUE)
title(paste("seed:", i, "n:", nInd(samp), "snps:", snps))
the_threshold <- threshold_predictor(fstats$average$thresholds)
the_distance <- bitwise.dist(samp)
z <- filter_stats(x = samp, distance = bitwise.dist,
threshold = the_threshold, stats = "MLGs")
abline(v = the_threshold, lty = 2)
text(the_threshold, 0,
labels = paste("Threshold:", signif(the_threshold, 3)),
adj = 0)
samplist[[i]]$samp <- samp
samplist[[i]]$tree <- upgma(the_distance)
samplist[[i]]$mlgs <- z
athresh <- duplicated(z$average)
nthresh <- duplicated(z$nearest)
fthresh <- duplicated(z$farthest)
avarray[, , i] <- table(athresh, trueclones)
avarray[, , i] <- sweep(avarray[, , i], 2, colSums(avarray[, , i]), "/")
neararray[, , i] <- table(nthresh, trueclones)
neararray[, , i] <- sweep(neararray[, , i], 2, colSums(neararray[, , i]), "/")
fararray[, , i] <- table(fthresh, trueclones)
fararray[, , i] <- sweep(fararray[, , i], 2, colSums(fararray[, , i]), "/")
}
systime <- Sys.time()
systime
## [1] "2015-05-04 16:40:31 PDT"
# color_mlg_tree(samp, upgma(bitwise.dist(samp)), z$average)
# axisPhylo(1)
Now we get to see how well we did.
(ares <- apply(avarray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.998875 0.018875
## TRUE 0.001125 0.981125
(nres <- apply(neararray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.998875 0.018875
## TRUE 0.001125 0.981125
(fres <- apply(fararray, 1:2, mean))
## trueclones
## thresh FALSE TRUE
## FALSE 0.998875 0.018875
## TRUE 0.001125 0.981125
| True Positive % | False Positive % | |
|---|---|---|
| farthest | 98.1 | 0.112 |
| average | 98.1 | 0.112 |
| nearest | 98.1 | 0.112 |
options(width = 100)
devtools::session_info()
## Session info ---------------------------------------------------------------------------------------
## setting value
## version R version 3.2.0 (2015-04-16)
## system x86_64, darwin13.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/Los_Angeles
## Packages -------------------------------------------------------------------------------------------
## package * version date source
## ade4 1.7-2 2015-04-14 CRAN (R 3.2.0)
## adegenet 2.0.0 2015-05-04 Github (thibautjombart/adegenet@a230516)
## animation 2.3 2014-07-25 CRAN (R 3.2.0)
## ape 3.2 2014-12-05 CRAN (R 3.2.0)
## assertthat * 0.1 2013-12-06 CRAN (R 3.1.0)
## bitops * 1.0-6 2013-08-17 CRAN (R 3.1.0)
## boot * 1.3-16 2015-03-08 CRAN (R 3.2.0)
## cluster * 2.0.1 2015-01-31 CRAN (R 3.1.2)
## coda * 0.17-1 2015-03-03 CRAN (R 3.2.0)
## codetools * 0.2-11 2015-03-10 CRAN (R 3.2.0)
## colorspace * 1.2-6 2015-03-11 CRAN (R 3.2.0)
## DBI * 0.3.1 2014-09-24 CRAN (R 3.1.1)
## deldir * 0.1-9 2015-03-09 CRAN (R 3.2.0)
## devtools * 1.7.0 2015-01-17 CRAN (R 3.2.0)
## digest * 0.6.8 2014-12-31 CRAN (R 3.1.2)
## dplyr * 0.4.1 2015-01-14 CRAN (R 3.1.2)
## evaluate * 0.6 2015-04-13 CRAN (R 3.2.0)
## formatR * 1.1 2015-03-29 CRAN (R 3.2.0)
## ggplot2 * 1.0.1 2015-03-17 CRAN (R 3.2.0)
## gtable * 0.1.2 2012-12-05 CRAN (R 3.1.0)
## highr * 0.4.1 2015-03-29 CRAN (R 3.2.0)
## htmltools * 0.2.6 2014-09-08 CRAN (R 3.1.1)
## httpuv * 1.3.2 2014-10-23 CRAN (R 3.1.2)
## igraph * 0.7.1 2014-04-22 CRAN (R 3.1.0)
## knitr * 1.10 2015-04-23 CRAN (R 3.2.0)
## lattice * 0.20-31 2015-03-30 CRAN (R 3.2.0)
## LearnBayes * 2.15 2014-05-29 CRAN (R 3.1.0)
## magrittr * 1.5 2014-11-22 CRAN (R 3.1.2)
## MASS * 7.3-40 2015-03-21 CRAN (R 3.2.0)
## Matrix * 1.2-0 2015-04-04 CRAN (R 3.2.0)
## mgcv * 1.8-6 2015-03-31 CRAN (R 3.2.0)
## mime * 0.3 2015-03-29 CRAN (R 3.1.3)
## munsell * 0.4.2 2013-07-11 CRAN (R 3.1.0)
## nlme * 3.1-120 2015-02-20 CRAN (R 3.2.0)
## nnls * 1.4 2012-03-19 CRAN (R 3.1.2)
## pegas * 0.7-0.4 2015-05-04 Github (emmanuelparadis/pegas@883d74f)
## permute * 0.8-3 2014-01-29 CRAN (R 3.1.0)
## phangorn 1.99-14 2015-05-04 Github (KlausVigo/phangorn@cf4aef0)
## plyr * 1.8.1 2014-02-26 CRAN (R 3.1.0)
## poppr 1.1.4.99-407 2015-05-04 Github (grunwaldlab/poppr@f99f2a7)
## proto * 0.3-10 2012-12-22 CRAN (R 3.1.0)
## R6 * 2.0.1 2014-10-29 CRAN (R 3.1.2)
## RColorBrewer * 1.1-2 2014-12-07 CRAN (R 3.1.2)
## Rcpp * 0.11.5 2015-03-06 CRAN (R 3.2.0)
## RCurl * 1.95-4.5 2014-12-28 CRAN (R 3.1.2)
## reshape2 * 1.4.1 2014-12-06 CRAN (R 3.1.2)
## rmarkdown * 0.5.1 2015-01-26 CRAN (R 3.2.0)
## rstudioapi * 0.3.1 2015-04-07 CRAN (R 3.2.0)
## scales * 0.2.4 2014-04-22 CRAN (R 3.1.0)
## seqinr * 3.1-3 2014-12-17 CRAN (R 3.1.2)
## shiny * 0.11.1.9004 2015-04-16 Github (rstudio/shiny@02caf05)
## sp * 1.0-17 2015-01-08 CRAN (R 3.1.2)
## spdep * 0.5-88 2015-03-17 CRAN (R 3.2.0)
## stringr * 0.6.2 2012-12-06 CRAN (R 3.1.0)
## vegan * 2.2-1 2015-01-12 CRAN (R 3.1.2)
## xtable * 1.7-4 2014-09-12 CRAN (R 3.1.1)